# -*- coding:utf-8 -*-
import re
import urllib
import urllib.request
import gzip
import http.cookiejar
import io
import sys
# Rewrap stdout so Chinese text prints correctly regardless of the console's
# default encoding (original note mentioned gb18030; UTF-8 is used here).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Decompress helper


def ungzip(data):
    """Decompress *data* if it is gzip-compressed, then decode it as UTF-8.

    data: raw HTTP response bytes, possibly gzip-compressed.
    Returns the decoded text.  If the bytes are not valid gzip, they are
    decoded as-is.
    """
    try:
        print('正在解压。。。。')
        data = gzip.decompress(data)
        print('解压完毕')
    except (OSError, EOFError):
        # gzip.decompress raises OSError (BadGzipFile) on a bad magic number
        # and EOFError on a truncated stream -- i.e. "not gzipped".
        # The original bare `except:` also swallowed KeyboardInterrupt etc.
        print('未经压缩，无需解压')
    return data.decode('utf-8')

# Extract the _xsrf token from HTML


def getXSRF(data):
    """Extract the hidden `_xsrf` form token from an HTML page.

    data: HTML text containing `name="_xsrf" value="..."`.
    Returns the first token found.
    Raises IndexError when no token is present (same as the original).
    """
    # Non-greedy (.*?) stops at the FIRST closing quote; the original greedy
    # (.*) swallowed everything up to the last quote on the line, so e.g.
    # `value="tok" type="hidden"` yielded `tok" type="hidden`.
    cer = re.compile(r'name="_xsrf" value="(.*?)"', flags=0)
    strlist = cer.findall(data)
    return strlist[0]

# Build an opener carrying the request headers


def getOpener(head):
    """Build a urllib opener that remembers cookies and sends *head* on
    every request.

    head: dict mapping request-header name -> value.
    Returns a configured OpenerDirector.
    """
    # Cookie jar + processor so responses' Set-Cookie headers are kept
    # and replayed on subsequent requests through this opener.
    cookie_jar = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(cookie_handler)
    # addheaders wants a list of (name, value) tuples.
    opener.addheaders = list(head.items())
    return opener

# Save fetched data to disk


def saveFile(data, save_path='J:\\Python\\Demo\\temp.out'):
    """Write *data* (text) to *save_path* as UTF-8 bytes.

    data: str to save.
    save_path: destination file.  Defaults to the original hard-coded path
        so existing callers are unaffected; new callers may override it.
    """
    data = data.encode('utf-8')
    # `with` closes the handle even if write() raises; the original
    # open()/close() pair leaked the file object on error.
    with open(save_path, 'wb') as f_obj:  # wb: write binary
        f_obj.write(data)


# Request headers sent with every fetch; the browser-like User-Agent and
# Host make the site serve its normal desktop HTML.  Accept-Encoding
# advertises gzip, which is why responses go through ungzip().
header = {
    'Connection': 'Keep-alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Accept-Encoding': 'gzip,deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Host': 'www.qiushibaike.com'
}


# Start at page 1 of the "hot" listing.
page = 1
url = 'http://www.qiushibaike.com/hot/'
# _xsrf = getXSRF(data.decode())


# Main flow: fetch page 1, scrape entries with a regex, pull the pagination
# token, fetch page 2, print the page-1 entries and save page 2's HTML.
# Any failure (network error, regex miss) is caught at the bottom and printed.
try:
    # Wrap the request headers into a cookie-aware opener.
    opener = getOpener(header)
    # Open the connection to page 1.
    op = opener.open(url)
    # Read the raw (possibly gzip-compressed) response bytes.
    data = op.read()
    # Decompress if needed and decode to str.
    data = ungzip(data)
    # op = urllib.request.urlopen(url)
    # Capture groups per entry: (author, articleGender div text, content,
    # trailing html, "number" vote count) -- presumably; verify against the
    # live page markup, which may have changed since this was written.
    strRex = ('<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<div class="articleGender.*?">(.*?)</div>' +
              '.*?<div class="content">(.*?)</div>(.*?)<div class="stats.*?class="number">(.*?)</i>')
    pattern = re.compile(strRex, re.S)
    items = re.findall(pattern, data)
    # Pagination token: the `s=...` query value inside the page links.
    strS = '<li>\n<a href=".*?s=(.*?)".*?>'
    pattS = re.compile(strS, re.S)
    # Original note: re.search on a plain string only matched the simple
    # cases; re.compile() was needed to use groups/flags as intended.
    # s = re.search(strS, data) 、

    # NOTE(review): if the token regex matches nothing, s is None and
    # s.group(1) raises AttributeError, which the except below swallows.
    s = re.search(pattS, data)
    print(s.group(1))
    s = s.group(1)
    page = 2
    url += 'page/' + str(page) + '?s=' + str(s)
    # Open the connection for page 2.
    op = opener.open(url)
    # Read page-2 bytes.
    data1 = op.read()
    # Decompress/decode page 2.
    data1 = ungzip(data1)
    # print(url)
    # Print the entries scraped from PAGE 1 (items was built before the
    # page-2 fetch; only data1, the raw page-2 HTML, is saved below).
    for item in items:
        print(item[0] + item[1] + item[2] + item[3])
    # print(items)
    # saveFile(''.join(str(e) for e in items))  # original note: "the correct code"
    saveFile(data1)
except Exception as e:
    # Top-level boundary: report and exit quietly.
    print(e)
